Load needed libraries:

#install.packages("readr")
library("readr")
#install.packages('data.table')
library(data.table)
library(stringr)
#install.packages('ggplot2')
library(ggplot2)
library(gridExtra)
library(grid)
library(plyr)
library(purrr)
library(dplyr)
#install.packages('flextable')
library(flextable)
library(tibble)
library(ComplexHeatmap)
library(RColorBrewer)
library(ggplotify)

Alpha diversity

Load the data

First lets load the tables (.csv) generated in Qiita

Creating tables

Prepare data frames for healthy samples from American Gut Study subsample:

# Load all AGP data
AGP <- read.csv("~/Desktop/master_project/project_analysis/01_tidy_data/AGP_all.csv", header = TRUE, sep = ",")  

# Load healthy samples' table
all_healthy <- read.csv("~/Desktop/master_project/project_analysis/01_tidy_data/AGP_healthy.csv", header = TRUE, sep = ",")    

#all_healthy <- all_healthy[1:10]

all_healthy <- select(all_healthy, sample_id, shannon_entropy, chao1, menhinick, margalef, fisher_alpha, simpson, pielou_evenness, gini_index, strong, simpson, faith_pd, sex, race, host_age)


all_healthy$condition <- 'healthy'
all_healthy$qiita_study_id <- AGP$qiita_study_id[match(all_healthy$sample_id, AGP$sample_id)]

names(all_healthy)[names(all_healthy) == 'host_age'] <- 'age'

Load data for Inflammatory Bowel Disease data sets

all_IBD <- read.csv("~/Desktop/master_project/project_analysis/01_tidy_data/IBD_all.csv", header = TRUE, sep = ",")  

Load data for Fecal transplant - CDI with underlying IBD data set

trans_IBD_CDI <- read.csv("~/Desktop/master_project/project_analysis/01_tidy_data/all_trans_IBD_CDI.csv", header = TRUE, sep = ",")  

Load data for Changes following fecal microbial transplantation for recurrent CDI data set

C_diff_trans <- read.csv("~/Desktop/master_project/project_analysis/01_tidy_data/all_C_diff_trans.csv", header = TRUE, sep = ",")  

Load data for Longitudinal Chron’s disease study data set

CD_2 <- read.csv("~/Desktop/master_project/project_analysis/01_tidy_data/all_CD_2.csv", header = TRUE, sep = ",")  

Let’s define vector of names of the alpha diversity metrics that are going to be analysed:

metric <- c("shannon_entropy", "chao1", "menhinick", "margalef", "fisher_alpha", "simpson", "gini_index", "strong", "pielou_evenness", "faith_pd" ) 

Healthy vs IBD analysis

#merge two datasets
healthy_disease <-  rbind.fill(all_healthy, all_IBD)

healthy_disease$condition <- as.factor(healthy_disease$condition)
healthy_disease$condition <- relevel(healthy_disease$condition, "healthy")
# Sizes of each dataset
table(healthy_disease$condition)
## 
## healthy      CD      UC 
##     847      24      40
nrow(healthy_disease)
## [1] 911
head(healthy_disease)
##         sample_id shannon_entropy     chao1 menhinick  margalef fisher_alpha
## 1 10317.000002033        5.652189 203.40000  3.795524 19.963845     40.36435
## 2 10317.000003915        3.992920 222.03030  4.105362 21.604709     44.95118
## 3 10317.000013515        2.888589  87.31250  1.729933  9.024752     14.38912
## 4 10317.000014528        4.912853 140.83333  2.969287 15.588208     29.00431
## 5 10317.000015825        5.420265 142.25000  3.072567 16.135163     30.35482
## 6 10317.000015938        1.990738  70.46154  1.549193  8.067581     12.51383
##     simpson pielou_evenness gini_index    strong  faith_pd    sex      race
## 1 0.9552311       0.7632090  0.9993042 0.5656463 19.857051 female Caucasian
## 2 0.7977413       0.5246751  0.9995009 0.6729937 18.517975   male Caucasian
## 3 0.7276160       0.4706630  0.9998499 0.7485970  9.413326   male  Hispanic
## 4 0.9217813       0.6870763  0.9995303 0.5995362 15.162940 female Caucasian
## 5 0.9563307       0.7578467  0.9994371 0.5678992 13.546122 female Caucasian
## 6 0.4286436       0.3134449  0.9998760 0.7680000  9.802688 female     Other
##    age condition qiita_study_id age_at_diagnosis
## 1 37.7   healthy          10317               NA
## 2 43.6   healthy          10317               NA
## 3 29.1   healthy          10317               NA
## 4 48.7   healthy          10317               NA
## 5 53.2   healthy          10317               NA
## 6 54.6   healthy          10317               NA
  1. Distributions of metrics in disease datasets

Generate a vector of 10 random colors for histograms:

qual_col_pals <- brewer.pal.info[brewer.pal.info$category == 'qual',]
colors <- unlist(mapply(brewer.pal, qual_col_pals$maxcolors, rownames(qual_col_pals)))
histo_IBD <- vector('list', length(metric))

for (i in 1:length(metric)){
  histo_IBD[[i]] <- all_IBD %>% ggplot(aes(x = .data[[metric[i]]])) +
    geom_histogram(aes(y=..density..), colour="black", fill="white", bins=30)+
    geom_density(alpha=.2, fill=colors[i]) +
    xlab(label = metric[i]) + 
    ylab(label = "density")
}

grid.arrange(histo_IBD[[1]], histo_IBD[[2]],histo_IBD[[3]], histo_IBD[[4]],histo_IBD[[5]], histo_IBD[[6]],histo_IBD[[7]], histo_IBD[[8]],histo_IBD[[9]], histo_IBD[[10]], ncol=3, top = textGrob("Distributions of 1) Shannon's index  2) Chao1 3) Menhinick's index \n4) Margalef's index 5) Fisher's index 6) Simpson 7) Gini index 8) Strong's index 9) Pielou's evenness \nand 10) Faith's PD in IBD datasets", gp=gpar(fontsize=10,font=2)))

  1. Distributions of metrics in American Gut Project dataset
histo_healthy <- vector('list', length(metric))

for (i in 1:length(metric)){
  histo_healthy[[i]] <- all_healthy %>% ggplot(aes(x = .data[[metric[i]]])) +
    geom_histogram(aes(y=..density..), colour="black", fill="white", bins=30)+
    geom_density(alpha=.2, fill=colors[i]) +
    xlab(label = metric[i]) + 
    ylab(label = "density") 
}

grid.arrange(histo_healthy[[1]], histo_healthy[[2]], histo_healthy[[3]], histo_healthy[[4]], histo_healthy[[5]], histo_healthy[[6]], histo_healthy[[7]], histo_healthy[[8]], histo_healthy[[9]], histo_IBD[[10]], ncol=3, top = textGrob("Distributions of 1) Shannon's index  2) Chao1 3) Menhinick's index \n4) Margalef's index 5) Fisher's index 6) Simpson 7) Gini index 8) Strong's index 9) Pielou's evenness \nand 10) Faith's PD in healthy dataset", gp=gpar(fontsize=10,font=2))) 

Compaisons between two datasets

Density plots and Box plots

density <- vector('list', length(metric))
box <- vector('list', length(metric))



for (i in 1:length(metric)){
  mean_line <- healthy_disease %>% dplyr::group_by(condition) %>% dplyr::summarise(grp_mean=mean(.data[[metric[i]]]))
  
  density[[i]] <- healthy_disease %>% ggplot(aes(x = .data[[metric[i]]], color = condition)) +
    geom_density()+
    geom_vline(data = mean_line, aes(xintercept = grp_mean, color = condition), linetype = "dashed")+ 
    labs(x = metric[i])

  box[[i]] <- healthy_disease %>% ggplot(aes(x = .data[[metric[i]]], color = condition)) +
    geom_boxplot() +
    labs(x = metric[i])
  
  if(metric[i] != "shannon_entropy" & metric[i] !="strong" & metric[i] != "gini_index" &  metric[i] != "menhinick"){
    density [[i]] <- density [[i]] + 
     scale_x_continuous(trans = 'log10') +
     xlab(paste(metric[i], "(log10)", sep = " ")) 
    
    box [[i]] <- box [[i]] + 
     scale_x_continuous(trans = 'log10') +
     xlab(paste(metric[i], "(log10)", sep = " ")) 
  }
}

# Show plots
for (j in 1:length(metric)){
  grid.arrange(density[[j]], box[[j]], ncol=2, top = paste("Density and box plot comparing healthy vs diseases data for metric:", metric[j], sep=" "))
}

violin <- vector('list', length(metric))

for (i in 1:length(metric)){
  mean_line <- healthy_disease %>% dplyr::group_by(condition) %>% dplyr::summarise(grp_mean=mean(.data[[metric[i]]]))
  plot_data <- healthy_disease %>% dplyr::group_by(condition) %>% dplyr::mutate(m = mean(.data[[metric[i]]])) 

  violin[[i]] <- plot_data %>% ggplot(aes(x = .data[[metric[i]]], y = reorder(condition, -m), color = condition, fill = condition)) +
    geom_violin()+
    geom_boxplot(width=0.1, color="grey", alpha=0.2) +
    geom_vline(data = mean_line, aes(xintercept = grp_mean, color = condition), linetype = "dashed")+ 
    labs(x = metric[i])+
    ylab("")+
    #ggtitle("A Violin plot of distributions of alpha metrics in different conditions")+
    theme(legend.position="none") 
  
  if(metric[i] != "shannon_entropy" & metric[i] !="strong" & metric[i] != "gini_index"  &  metric[i] != "menhinick"){
   violin [[i]] <- violin [[i]] + 
     scale_x_continuous(trans = 'log10') +
     xlab(paste(metric[i], "(log10)", sep = " ")) 
  }
}

# Show plots

#violin

grid.arrange(violin[[1]], violin[[2]],violin[[3]], violin[[4]],violin[[5]], violin[[6]],violin[[7]], violin[[8]],violin[[9]], violin[[10]], ncol=4, top = textGrob("Distributions of 1) Shannon's index  2) Chao1 3) Menhinick's index \n4) Margalef's index 5) Fisher's index 6) Simpson 7) Gini index 8) Strong's index 9) Pielou's evenness \nand 10) Faith's PD in IBD datasets", gp=gpar(fontsize=10,font=2))) 

Mann-Whitney-Wilcoxon Test

Non-parametric test (does not assume normal distribution)

test_helathy_IBD <- list()

for (i in 1:length(metric)){
  test_helathy_IBD[[i]] <- pairwise.wilcox.test(healthy_disease[[metric[i]]], healthy_disease$condition, p.adjust.method="none") %>% 
  broom::tidy() %>% add_column(parameter = metric[i], .before='group1')
  test_helathy_IBD[[i]]$p.value <- round(test_helathy_IBD[[i]]$p.value, digits = 17)
}

tests <- do.call(what = rbind, args = test_helathy_IBD)

table1 <- tests %>% 
  add_column(p.adjusted = p.adjust(tests$p.value, "fdr"), .after='p.value') %>%
  flextable() %>% 
  bold(~ p.value < 0.05, 4) %>%
  bold(~ p.adjusted < 0.05, 5) %>%
  add_header_lines(values = "Results of the Wilcox test for distributions of different conditions")

table2 <- tests %>% 
  add_column(p.adjusted = p.adjust(tests$p.value, "fdr"), .after='p.value') %>%
  arrange(p.value)  %>%
  flextable() %>% 
  bold(~ p.value < 0.05, 4) %>%
  bold(~ p.adjusted < 0.05, 5) %>%
  add_header_lines(values = "Results of the Wilcox test for distributions of different conditions")

Orered by name:

table1

Ordered by signifficance:

table2

An FDR-adjusted p-value (also called q-value) of 0.05 indicates that 5% of significant tests will result in false positives. In other words, an FDR of 5% means that, among all results called significant, only 5% of these are truly null.

# Do Wilcox test to see which parameters show significant differenc between healthy and unhealthy samples
wilcox_healthy_disease <- healthy_disease %>%
  summarise(Shannon = wilcox.test(shannon_entropy[condition == "healthy"], shannon_entropy[condition != "healthy"])$p.value,
            Chao1 = wilcox.test(chao1[condition == "healthy"], chao1[condition != "healthy"])$p.value,
            Menhinick = wilcox.test(menhinick[condition == "healthy"], menhinick[condition != "healthy"])$p.value,
            Margalef = wilcox.test(margalef[condition == "healthy"], margalef[condition != "healthy"])$p.value,
            Pielou = wilcox.test(pielou_evenness[condition == "healthy"], pielou_evenness[condition != "healthy"])$p.value,
            Fisher = wilcox.test(fisher_alpha[condition == "healthy"], fisher_alpha[condition != "healthy"])$p.value,
            Gini = wilcox.test(gini_index[condition == "healthy"], gini_index[condition != "healthy"])$p.value,
            Strong = wilcox.test(strong[condition == "healthy"], strong[condition != "healthy"])$p.value,
            Faith = wilcox.test(faith_pd[condition == "healthy"], faith_pd[condition != "healthy"])$p.value) 
wilcox_healthy_disease <- t(wilcox_healthy_disease)
colnames(wilcox_healthy_disease) <- c("p.value")
wilcox_healthy_disease <- data.frame(Alpha_Metric = row.names(wilcox_healthy_disease), wilcox_healthy_disease)
wilcox_healthy_disease$p.value <- round(wilcox_healthy_disease$p.value, digits = 17)

wilcox_healthy_disease %>%
  flextable() %>%
  bold(~ p.value < 0.05, 2) %>%
  add_header_lines(values = "Results of the Wilcox test for distributions of different parameters between healthy and unhealthy samples")

Check the correlation between alpha diversity measures and condition with Kruskal-Wallis Test

test of the differentiation across multiple categories, analogous to one-way ANOVA

kruskal_results <- healthy_disease %>%
  summarise(Shannon = kruskal.test(healthy_disease$shannon_entropy ~ healthy_disease$condition)$p.value,
  Chao1 = kruskal.test(healthy_disease$chao1 ~ healthy_disease$condition)$p.value,
  Fisher = kruskal.test(healthy_disease$fisher_alpha ~ healthy_disease$condition)$p.value,
  Margalef =kruskal.test(healthy_disease$margalef ~ healthy_disease$condition)$p.value,
  Simpson = kruskal.test(healthy_disease$simpson ~ healthy_disease$condition)$p.value,
  Menhinick = kruskal.test(healthy_disease$menhinick ~ healthy_disease$condition)$p.value,
  Pielou = kruskal.test(healthy_disease$pielou_evenness ~ healthy_disease$condition)$p.value,
  Gini = kruskal.test(healthy_disease$gini_index ~ healthy_disease$condition)$p.value,
  Strong = kruskal.test(healthy_disease$strong ~ healthy_disease$condition)$p.value,
  Faith = kruskal.test(healthy_disease$faith_pd ~ healthy_disease$condition)$p.value)

kruskal_results <- as.data.frame(t(kruskal_results))
colnames(kruskal_results) <- c("p.value")
kruskal_results <- data.frame(Alpha_Metric = row.names(kruskal_results), kruskal_results)


kruskal_results %>% 
  flextable() %>%
  bold(~ p.value < 0.05, 2) %>%
  add_header_lines(values = "Results of the Kruskal-Wallis test for differentiation of different parameters across different conditions")

Heatmap representation of density distribution of healthy and disease data

for(x in 2:11){

#for(x in 2:10){
  heatmap_h <- densityHeatmap(as.matrix(all_healthy[,x]), ylab = colnames(all_healthy)[x], title ="healthy")
  heatmap_d <- densityHeatmap(as.matrix(all_IBD[,x]), ylab = colnames(all_IBD)[x], title = "disease")
  heatmap_hd <- densityHeatmap(as.matrix(healthy_disease[,x]), ylab = colnames(healthy_disease)[x], title = "healthy + disease")
  
  gl <- lapply(list(heatmap_h, heatmap_d, heatmap_hd), as.grob)
  grid.arrange(grobs=gl, ncol=3, clip=TRUE)
}

Longitudinal Chron’s disease analysis

table(CD_2$description, CD_2$condition)
##                           
##                            control crohns
##   father family 01-00010        27      0
##   father family 01-00011        20      0
##   father family 01-00012        54      0
##   father family 01-00013        27      0
##   fecal sample index pt 20       0     20
##   fecal sample index pt 21       0     25
##   fecal sample index pt 23       0     21
##   fecal sample index pt 24       0     26
##   fecal sample index pt 25       0     14
##   fecal sample index pt 26       0     16
##   fecal sample index pt 27       0     14
##   fecal sample index pt 31       0     10
##   fecal sample index pt 32       0     22
##   fecal sample index pt 33       0     20
##   fecal sample index pt 34       0      2
##   fecal sample index pt 35       0     14
##   fecal sample index pt 36       0      9
##   fecal sample index pt 37       0     22
##   fecal sample index pt 39       0     26
##   index family 01-00012          0     48
##   index family 01-00013          0     26
##   mother family 01-00010        25      0
##   mother family 01-00011        23      0
##   mother family 01-00012        57      0
##   mother family 01-00013        27      0
##   sibling family 01-00010       18      0
##   sibling family 01-00012       55      0
##   sibling family 01-00013       20      0
table(CD_2$condition)
## 
## control  crohns 
##     353     335
table(CD_2$surgery_and_ibd)
## 
##          control           crohns crohns (surgery) 
##              353              173              162
violin_CD_2a <- vector('list', length(metric))

for (i in 1:length(metric)){
  mean_line <- CD_2 %>% dplyr::group_by(condition) %>% dplyr::summarise(grp_mean = mean(.data[[metric[i]]]))

  violin_CD_2a [[i]] <- CD_2 %>% ggplot(aes(x = .data[[metric[i]]], y = condition, color = condition, fill = condition)) +
    geom_violin()+
    geom_boxplot(width=0.1, color="grey", alpha=0.2) +
    geom_vline(data = mean_line, aes(xintercept = grp_mean, color = condition), linetype = "dashed")+ 
    labs(x = metric[i])+
    ylab("") +
    theme(legend.position="none") 
  
    if(metric[i] != "shannon_entropy" & metric[i] !="strong" & metric[i] != "gini_index"  &  metric[i] != "menhinick"){
     violin_CD_2a [[i]] <- violin_CD_2a [[i]] + 
       scale_x_continuous(trans = 'log10') +
       xlab(paste(metric[i], "(log10)", sep = " ")) 
    }

}

#plots for Shannon entropy
violin_CD_2a 
## [[1]]

## 
## [[2]]

## 
## [[3]]

## 
## [[4]]

## 
## [[5]]

## 
## [[6]]

## 
## [[7]]

## 
## [[8]]

## 
## [[9]]

## 
## [[10]]

grid.arrange(violin_CD_2a[[1]], violin_CD_2a[[2]],violin_CD_2a[[3]], violin_CD_2a[[4]],violin_CD_2a[[5]], violin_CD_2a[[6]],violin_CD_2a[[7]], violin_CD_2a[[8]],violin_CD_2a[[9]],violin_CD_2a[[10]], ncol=4, top = textGrob("Distributions of 1) Shannon's index  2) Chao1 3) Menhinick's index \n4) Margalef's index 5) Fisher's index 6) Simpson 7) Gini index 8) Strong's index 9) Pielou's evenness \nand 10) Faith's PD in IBD datasets", gp=gpar(fontsize=10,font=2)))

violin_CD_2b <- vector('list', length(metric))

for (i in 1:length(metric)){
  mean_line <- CD_2 %>% dplyr::group_by(surgery_and_ibd) %>% dplyr::summarise(grp_mean = mean(.data[[metric[i]]]))

  violin_CD_2b [[i]] <- CD_2 %>% ggplot(aes(x = .data[[metric[i]]], y = surgery_and_ibd, color = surgery_and_ibd, fill = surgery_and_ibd)) +
    geom_violin()+
    geom_boxplot(width=0.1, color="grey", alpha=0.2) +
    geom_vline(data = mean_line, aes(xintercept = grp_mean, color = surgery_and_ibd), linetype = "dashed")+ 
    labs(x = metric[i])+
    ylab("") +
    theme(legend.position="none") 
  
    if(metric[i] != "shannon_entropy" & metric[i] !="strong" & metric[i] != "gini_index"  &  metric[i] != "menhinick"){
     violin_CD_2b [[i]] <- violin_CD_2b [[i]] + 
       scale_x_continuous(trans = 'log10') +
       xlab(paste(metric[i], "(log10)", sep = " ")) 
    }

}

#plots for Shannon entropy
violin_CD_2b 
## [[1]]

## 
## [[2]]

## 
## [[3]]

## 
## [[4]]

## 
## [[5]]

## 
## [[6]]

## 
## [[7]]

## 
## [[8]]

## 
## [[9]]

## 
## [[10]]

violin_CD_2_surg <- vector('list', length(metric))

for (i in 1:length(metric)){
  mean_line <- CD_2 %>% dplyr::group_by(surgery_type) %>% dplyr::summarise(grp_mean = mean(.data[[metric[i]]]))
  plot_data <- CD_2 %>% dplyr::group_by(surgery_type) %>% dplyr::mutate(m = mean(.data[[metric[i]]])) 

  violin_CD_2_surg [[i]] <- plot_data %>% ggplot(aes(x = .data[[metric[i]]], y = reorder(surgery_type, -m), color = surgery_type, fill = surgery_type)) +
    geom_violin()+
    geom_boxplot(width=0.1, color="grey", alpha=0.2) +
    geom_vline(data = mean_line, aes(xintercept = grp_mean, color = surgery_type), linetype = "dashed")+ 
    labs(x = metric[i])+
    ylab("") +
    theme(legend.position="none") 
  
    if(metric[i] != "shannon_entropy" & metric[i] !="strong" & metric[i] != "gini_index"  &  metric[i] != "menhinick"){
     violin_CD_2_surg [[i]] <- violin_CD_2_surg [[i]] + 
       scale_x_continuous(trans = 'log10') +
       xlab(paste(metric[i], "(log10)", sep = " ")) 
    }

}

#plots for Shannon entropy
violin_CD_2_surg 
## [[1]]

## 
## [[2]]

## 
## [[3]]

## 
## [[4]]

## 
## [[5]]

## 
## [[6]]

## 
## [[7]]

## 
## [[8]]

## 
## [[9]]

## 
## [[10]]

Now let’s check the difference between this study and previous one measuring diversity in Crohn’s disease:

CD_1 <- healthy_disease[healthy_disease$condition != "UC",]

CD_2_surg <- CD_2
CD_2_surg$condition <- NULL
names(CD_2_surg)[names(CD_2_surg) == 'surgery_and_ibd'] <- 'condition'

#CD_check <- rbind.fill(CD_1, CD_2)
CD_check <- rbind.fill(CD_1, CD_2_surg)

CD_check$condition[CD_check$condition == "CD"] <-'CD_1'
CD_check$condition[CD_check$condition == "crohns"] <-'CD_2'
CD_check$condition[CD_check$condition == "crohns (surgery)"] <-'CD_2_surgery'
CD_check$condition[CD_check$condition == "healthy"] <-'control(AGP)'
CD_check$condition[CD_check$condition == "control"] <-'control_2'
violin_CD_check <- vector('list', length(metric))

for (i in 1:length(metric)){
  mean_line <- CD_check %>% dplyr::group_by(condition) %>% dplyr::summarise(grp_mean = mean(.data[[metric[i]]]))

  violin_CD_check [[i]] <- CD_check %>% ggplot(aes(x = .data[[metric[i]]], y = condition, color = condition, fill = condition)) +
    geom_violin()+
    geom_boxplot(width=0.1, color="grey", alpha=0.2) +
    geom_vline(data = mean_line, aes(xintercept = grp_mean, color = condition), linetype = "dashed")+ 
    scale_y_discrete(limits=c("CD_2", "control_2", "CD_2_surgery", "CD_1", "control(AGP)")) +
    labs(x = metric[i])+
    ylab("")  +
    theme(legend.position="none") 

    if(metric[i] != "shannon_entropy" & metric[i] !="strong" & metric[i] != "gini_index"  &  metric[i] != "menhinick"){
     violin_CD_check [[i]] <- violin_CD_check [[i]] + 
       scale_x_continuous(trans = 'log10') +
       xlab(paste(metric[i], "(log10)", sep = " ")) 
  }
}

#plots for Shannon entropy
violin_CD_check 
## [[1]]

## 
## [[2]]

## 
## [[3]]

## 
## [[4]]

## 
## [[5]]

## 
## [[6]]

## 
## [[7]]

## 
## [[8]]

## 
## [[9]]

## 
## [[10]]

test_CD_1 <- list()
test_CD_2 <- list()

for (i in 1:length(metric)){
  test_CD_1[[i]] <- pairwise.wilcox.test(CD_1[[metric[i]]], CD_1$condition, p.adjust.method="none") %>% 
  broom::tidy() %>% add_column(parameter = metric[i], .before='group1')
  test_CD_1[[i]]$p.value <- round(test_CD_1[[i]]$p.value, digits = 17)
  
  test_CD_2[[i]] <- pairwise.wilcox.test(CD_2[[metric[i]]], CD_2$surgery_and_ibd, p.adjust.method="none") %>% 
  broom::tidy() %>% add_column(parameter = metric[i], .before='group1')
  test_CD_2[[i]]$p.value <- round(test_CD_2[[i]]$p.value, digits = 17)
}

tests1 <- do.call(what = rbind, args = test_CD_1)
tests2 <- do.call(what = rbind, args = test_CD_2)


table_CD_1 <- tests1 %>% 
  add_column(p.adjusted = p.adjust(tests1$p.value, "fdr"), .after='p.value') %>%
  arrange(p.value, parameter)  %>%
  flextable() %>% 
  bold(~ p.value < 0.05, 4) %>%
  bold(~ p.adjusted < 0.05, 5) %>%
  add_header_lines(values = "Results of the Wilcox test for distributions of different conditions in CD_1 dataset")

table_CD_2 <- tests2 %>% 
  add_column(p.adjusted = p.adjust(tests2$p.value, "fdr"), .after='p.value') %>%
  arrange(p.value, parameter)  %>%
  flextable() %>% 
  bold(~ p.value < 0.05, 4) %>%
  bold(~ p.adjusted < 0.05, 5) %>%
  add_header_lines(values = "Results of the Wilcoxon test for distributions of different conditions in CD_2 dataset")

Results of the Wilcoxon test for diversity distributions of healthy and CD samples in first study:

table_CD_1

Results of the Wilcoxon test for diversity distributions of healthy and CD samples in second (longitudinal) study:

table_CD_2
CD_check_w <- CD_check %>% filter(CD_check$condition != "control(AGP)" & CD_check$condition != "control_2" )

test_CD_3 <- list()

for (i in 1:length(metric)){
  test_CD_3[[i]] <- pairwise.wilcox.test(CD_check_w[[metric[i]]], CD_check_w$condition, p.adjust.method="none") %>% 
  broom::tidy() %>% add_column(parameter = metric[i], .before='group1')
  test_CD_3[[i]]$p.value <- round(test_CD_3[[i]]$p.value, digits = 17)
}

tests <- do.call(what = rbind, args = test_CD_3)

table_CD_3 <- tests %>% 
  add_column(p.adjusted = p.adjust(tests$p.value, "fdr"), .after='p.value') %>%
  arrange(p.value, parameter)  %>%
  flextable() %>% 
  bold(~ p.value < 0.05, 4) %>%
  bold(~ p.adjusted < 0.05, 5) %>%
  add_header_lines(values = "Results of the Wilcox test for distributions of different groups of Crhohn's disease patients")

table_CD_3

Fecal transplant data analysis

From the paper: “A recent study suggests significantly lower response of CDI to FMT in patients with underlying inflammatory bowel disease (IBD). We have also previously described a higher rate of recurrence of CDI following FMT in patients with CDI and underlying IBD”

colnames(trans_IBD_CDI)
##  [1] "sample_id"                   "shannon_entropy"            
##  [3] "chao1"                       "fisher_alpha"               
##  [5] "margalef"                    "gini_index"                 
##  [7] "menhinick"                   "strong"                     
##  [9] "simpson"                     "faith_pd"                   
## [11] "pielou_evenness"             "animations_subject"         
## [13] "sex"                         "age"                        
## [15] "body_mass_index"             "condition"                  
## [17] "day_since_fmt"               "donor_or_patient"           
## [19] "number_recurrence_after_fmt" "qiita_study_id"
table(trans_IBD_CDI$condition)
## 
##            CDI       CDI + CD       CDI + UC Not applicable 
##             39             16             12             28
table(trans_IBD_CDI$day_since_fmt)
## 
##       -1       28        7 NA-Donor  no_data 
##       23       18       22       30        2
trans_IBD_CDI_1 <- trans_IBD_CDI %>%
  filter(day_since_fmt != "no_data" ) %>%
  filter(!(donor_or_patient == "Donor" & condition=="CDI")) 

violin_trans <- vector('list', length(metric))

for (i in 1:length(metric)){
  mean_line <- trans_IBD_CDI_1 %>% dplyr::group_by(condition, day_since_fmt) %>% dplyr::summarise(grp_mean = mean(.data[[metric[i]]]))

  violin_trans[[i]] <- trans_IBD_CDI_1 %>% ggplot(aes(x = .data[[metric[i]]], y = condition, color = condition, fill = condition)) +
    geom_violin()+
    geom_boxplot(width=0.1, color="grey", alpha=0.2) +
    geom_vline(data = mean_line, aes(xintercept = grp_mean, color = condition), linetype = "dashed")+ 
    labs(x = metric[i])+
    ylab("") +
    facet_wrap(vars(day_since_fmt))+
    theme(legend.position="none") 
  
  if(metric[i] != "shannon_entropy" & metric[i] !="strong" & metric[i] != "gini_index"  &  metric[i] != "menhinick"){
     violin_trans [[i]] <- violin_trans [[i]] + 
       scale_x_continuous(trans = 'log10') +
       xlab(paste(metric[i], "(log10)", sep = " ")) 
  }
}

#plots for Shannon entropy
violin_trans
## [[1]]

## 
## [[2]]

## 
## [[3]]

## 
## [[4]]

## 
## [[5]]

## 
## [[6]]

## 
## [[7]]

## 
## [[8]]

## 
## [[9]]

## 
## [[10]]

cond <- c("CDI + UC", "CDI", "CDI + CD")
test_CDI_trans <- list()
table <- list()

for (i in 1:length(cond)){
  trans_IBD_CDI_2 <- trans_IBD_CDI_1 %>%
    filter(condition == cond[i])
  
  for (j in 1:length(metric)){
    test_CDI_trans[[j]] <- pairwise.wilcox.test(trans_IBD_CDI_2[[metric[j]]], trans_IBD_CDI_2$day_since_fmt, p.adjust.method="none") %>% 
    broom::tidy() %>% add_column(parameter = metric[j], .before='group1')
    test_CDI_trans[[j]]$p.value <- round(test_CDI_trans[[j]]$p.value, digits = 17)
  }
  
  tests <- do.call(what = rbind, args = test_CDI_trans)

  table <- tests %>%
    add_column(p.adjusted = p.adjust(tests$p.value, "fdr"), .after='p.value') %>%
    flextable() %>%
    bold(~ p.value < 0.05, 4) %>%
    bold(~ p.adjusted < 0.05, 5) %>%
    add_header_lines(values = paste("Results of the Wilcox test for condition:", cond[i], sep = " "))
   
 print(table)
 
 test_CDI_trans <- list()
}
## a flextable object.
## col_keys: `parameter`, `group1`, `group2`, `p.value`, `p.adjusted` 
## header has 2 row(s) 
## body has 30 row(s) 
## original dataset sample: 
##         parameter group1 group2    p.value p.adjusted
## 1 shannon_entropy     28     -1 0.20000000 0.40000000
## 2 shannon_entropy      7     -1 0.02857143 0.08571429
## 3 shannon_entropy      7     28 0.68571429 0.97959184
## 4           chao1     28     -1 0.02857143 0.08571429
## 5           chao1      7     -1 0.05714286 0.13186813
## a flextable object.
## col_keys: `parameter`, `group1`, `group2`, `p.value`, `p.adjusted` 
## header has 2 row(s) 
## body has 30 row(s) 
## original dataset sample: 
##         parameter group1 group2      p.value   p.adjusted
## 1 shannon_entropy     28     -1 2.048483e-03 4.727269e-03
## 2 shannon_entropy      7     -1 2.502252e-03 5.361969e-03
## 3 shannon_entropy      7     28 8.078046e-01 8.975606e-01
## 4           chao1     28     -1 1.713188e-05 8.565939e-05
## 5           chao1      7     -1 2.070886e-07 1.553165e-06
## a flextable object.
## col_keys: `parameter`, `group1`, `group2`, `p.value`, `p.adjusted` 
## header has 2 row(s) 
## body has 30 row(s) 
## original dataset sample: 
##         parameter group1 group2     p.value p.adjusted
## 1 shannon_entropy     28     -1 0.031746032  0.0952381
## 2 shannon_entropy      7     -1 0.030303030  0.0952381
## 3 shannon_entropy      7     28 1.000000000  1.0000000
## 4           chao1     28     -1 0.059327061  0.1618011
## 5           chao1      7     -1 0.007969413  0.0952381

Short- and Longterm changes after fecal transplantation for recurrent CDI

table(C_diff_trans$disease_state)
## 
## post-FMT  Pre-FMT 
##       91        4
table(C_diff_trans$day_since_fmt)
## 
## -18  -2  -1   0   1   2   3   4   5   6   7   8   9  10  11  12  13  14  15  16 
##   1   1   1   1   2   3   2   2   2   3   2   3   3   2   3   3   2   3   3   2 
##  17  18  19  20  21  22  23  24  25  26  27  28  29  30  31  32  35  36  40  42 
##   3   3   2   3   2   3   2   2   1   3   2   1   1   1   1   1   1   1   1   2 
##  47  49  55  56  63  70  77  84 151 
##   1   2   1   2   2   2   2   2   1
table(C_diff_trans$animations_subject)
## 
## CD1 CD2 CD4 
##  36  23  36
C_diff_trans$animations_subject[C_diff_trans$animations_subject == "CD1"] <-'subject_1'
C_diff_trans$animations_subject[C_diff_trans$animations_subject == "CD2"] <-'subject_2'
C_diff_trans$animations_subject[C_diff_trans$animations_subject == "CD4"] <-'subject_3'
progression <- vector('list', length(metric))

for (i in 1:length(metric)){
  
  progression[[i]] <- C_diff_trans %>% ggplot(aes(x=day_since_fmt, y= .data[[metric[i]]], group=animations_subject)) +
    geom_line(aes(color=animations_subject))+
    geom_point(aes(color=animations_subject))+
    facet_wrap(vars(animations_subject), scale="free", ncol=2)
  
}

progression
## [[1]]

## 
## [[2]]

## 
## [[3]]

## 
## [[4]]

## 
## [[5]]

## 
## [[6]]

## 
## [[7]]

## 
## [[8]]

## 
## [[9]]

## 
## [[10]]

AGP + IBD + transplant CDI+IBD

before_trans <- trans_IBD_CDI %>%
  filter(day_since_fmt != c("7","28", "no_data", "NA-Donor"),
         condition != "Not applicable")

#healthy_disease_2 <- rbind.fill(healthy_disease, before_trans, CD_2)
healthy_disease_2 <- rbind.fill(healthy_disease, before_trans)

# Sizes of each dataset
table(healthy_disease_2$condition)
## 
##       CD      CDI CDI + CD CDI + UC  healthy       UC 
##       24       30       11        9      847       40
nrow(healthy_disease)
## [1] 911
ggplot(healthy_disease_2, aes(x = faith_pd)) +
    geom_histogram(aes(y=..density..), colour="black", fill="white", bins=30) +
    geom_density (alpha=.2, fill="#009E73") +
    xlab(label = "faith_pd") + 
    ylab(label = "density") +
    scale_x_continuous(trans = 'log10') +
    facet_wrap(vars(condition))

*The values on y scale are densities instead of frequencies (what percentage of observations in a dataset fall between different values)

violin_all <- vector('list', length(metric))

for (i in 1:length(metric)){
  mean_line <- healthy_disease_2 %>% dplyr::group_by(condition) %>% dplyr::summarise(grp_mean=mean(.data[[metric[i]]]))
  plot_data <- healthy_disease_2 %>% dplyr::group_by(condition) %>% dplyr::mutate(m = mean(.data[[metric[i]]])) 

  violin_all[[i]] <- plot_data %>% ggplot(aes(x = .data[[metric[i]]], y = reorder(condition, -m), color = condition, fill = condition)) +
    geom_violin()+
    geom_boxplot(width=0.1, color="grey", alpha=0.2) +
    geom_vline(data = mean_line, aes(xintercept = grp_mean, color = condition), linetype = "dashed")+ 
    labs(x = metric[i])+
    ylab("")+
    ggtitle("A Violin plot of distributions of alpha metrics in different conditions")+
    theme(legend.position="none") 
  
  if(metric[i] != "shannon_entropy" & metric[i] !="strong" & metric[i] != "gini_index"  &  metric[i] != "menhinick"){
     violin_all[[i]] <- violin_all[[i]] + 
       scale_x_continuous(trans = 'log10') +
       xlab(paste(metric[i], "(log10)", sep = " ")) 
  }
}

#plots for Shannon entropy
violin_all
## [[1]]

## 
## [[2]]

## 
## [[3]]

## 
## [[4]]

## 
## [[5]]

## 
## [[6]]

## 
## [[7]]

## 
## [[8]]

## 
## [[9]]

## 
## [[10]]

Mann-Whitney-Wilcoxon Test

Non-parametric test (does not assume normal distribution)

test <- list()

for (i in 1:length(metric)){
  test[[i]] <- pairwise.wilcox.test(healthy_disease_2[[metric[i]]], healthy_disease_2$condition, p.adjust.method="none") %>% 
  broom::tidy() %>% add_column(parameter = metric[i], .before='group1')
  test[[i]]$p.value <- round(test[[i]]$p.value, digits = 17)
}

tests <- do.call(what = rbind, args = test)

table1 <- tests %>% 
  add_column(p.adjusted = p.adjust(tests$p.value, "fdr"), .after='p.value') %>%
  arrange(p.value, parameter)  %>%
  flextable() %>% 
  bold(~ p.value < 0.05, 4) %>%
  bold(~ p.adjusted < 0.05, 5) %>%
  add_header_lines(values = "Results of the Wilcox test for distributions of different conditions")

table2 <- tests %>% 
  add_column(p.adjusted = p.adjust(tests$p.value, "fdr"), .after='p.value') %>%
  arrange(parameter, group1)  %>%
  flextable() %>% 
  bold(~ p.value < 0.05, 4) %>%
  bold(~ p.adjusted < 0.05, 5) %>%
  add_header_lines(values = "Results of the Wilcox test for distributions of different conditions")
#table1
#table2
# Do Wilcox test to see which parameters show significant differenc between healthy and unhealthy samples
wilcox_p_value <- healthy_disease_2 %>%
  summarise(Shannon = wilcox.test(shannon_entropy[condition == "healthy"], shannon_entropy[condition != "healthy"])$p.value,
            Chao1 = wilcox.test(chao1[condition == "healthy"], chao1[condition != "healthy"])$p.value,
            Menhinick = wilcox.test(menhinick[condition == "healthy"], menhinick[condition != "healthy"])$p.value,
            Margalef = wilcox.test(margalef[condition == "healthy"], margalef[condition != "healthy"])$p.value,
            Simpson = wilcox.test(simpson[condition == "healthy"], simpson[condition != "healthy"])$p.value,
            Fisher = wilcox.test(fisher_alpha[condition == "healthy"], fisher_alpha[condition != "healthy"])$p.value,
            Pielou = wilcox.test(pielou_evenness[condition == "healthy"], pielou_evenness[condition != "healthy"])$p.value,
            Gini = wilcox.test(gini_index[condition == "healthy"], gini_index[condition != "healthy"])$p.value,
            Strong = wilcox.test(strong[condition == "healthy"], strong[condition != "healthy"])$p.value,
            Faith = wilcox.test(faith_pd[condition == "healthy"], faith_pd[condition != "healthy"])$p.value) 
wilcox_p_value <- t(wilcox_p_value)
colnames(wilcox_p_value) <- c("p.value")
wilcox_p_value <- data.frame(Alpha_Metric = row.names(wilcox_p_value), wilcox_p_value)
wilcox_p_value$p.value <- round(wilcox_p_value$p.value, digits = 17)

wilcox_p_value %>%
  flextable() %>%
  bold(~ p.value < 0.05, 2) %>%
  add_header_lines(values = "Results of the Wilcox test for distributions of different parameters between healthy and unhealthy samples")

If we add CD samples from longitudinal study:

all_data <- rbind.fill(healthy_disease, before_trans, CD_2)

table(all_data$condition)
## 
##       CD      CDI CDI + CD CDI + UC  control   crohns  healthy       UC 
##       24       30       11        9      353      335      847       40
all_data$condition[all_data$condition == "crohns"] <-'CD'
all_data$condition[all_data$condition == "control"] <-'healthy'
violin_all_b <- vector('list', length(metric))

for (i in 1:length(metric)){
  mean_line <- all_data %>% dplyr::group_by(condition) %>% dplyr::summarise(grp_mean=mean(.data[[metric[i]]]))
  plot_data <- all_data %>% dplyr::group_by(condition) %>% dplyr::mutate(m = mean(.data[[metric[i]]])) 

  violin_all_b[[i]] <- plot_data %>% ggplot(aes(x = .data[[metric[i]]], y = reorder(condition, -m), color = condition, fill = condition)) +
    geom_violin()+
    geom_boxplot(width=0.1, color="grey", alpha=0.2) +
    geom_vline(data = mean_line, aes(xintercept = grp_mean, color = condition), linetype = "dashed")+ 
    labs(x = metric[i])+
    ylab("")+
    ggtitle("A Violin plot of distributions of alpha metrics in different conditions")+
    theme(legend.position="none") 
  
  if(metric[i] != "shannon_entropy" & metric[i] !="strong" & metric[i] != "gini_index"  &  metric[i] != "menhinick"){
     violin_all_b[[i]] <- violin_all_b[[i]] + 
       scale_x_continuous(trans = 'log10') +
       xlab(paste(metric[i], "(log10)", sep = " ")) 
  }
}

#plots for Shannon entropy
violin_all_b
## [[1]]

## 
## [[2]]

## 
## [[3]]

## 
## [[4]]

## 
## [[5]]

## 
## [[6]]

## 
## [[7]]

## 
## [[8]]

## 
## [[9]]

## 
## [[10]]

Session information

sessionInfo()
## R version 4.1.2 (2021-11-01)
## Platform: x86_64-pc-linux-gnu (64-bit)
## Running under: Ubuntu 22.04.1 LTS
## 
## Matrix products: default
## BLAS:   /usr/lib/x86_64-linux-gnu/blas/libblas.so.3.10.0
## LAPACK: /usr/lib/x86_64-linux-gnu/lapack/liblapack.so.3.10.0
## 
## locale:
##  [1] LC_CTYPE=en_US.UTF-8       LC_NUMERIC=C              
##  [3] LC_TIME=es_ES.UTF-8        LC_COLLATE=en_US.UTF-8    
##  [5] LC_MONETARY=es_ES.UTF-8    LC_MESSAGES=en_US.UTF-8   
##  [7] LC_PAPER=es_ES.UTF-8       LC_NAME=C                 
##  [9] LC_ADDRESS=C               LC_TELEPHONE=C            
## [11] LC_MEASUREMENT=es_ES.UTF-8 LC_IDENTIFICATION=C       
## 
## attached base packages:
## [1] grid      stats     graphics  grDevices utils     datasets  methods  
## [8] base     
## 
## other attached packages:
##  [1] ggplotify_0.1.0       RColorBrewer_1.1-3    ComplexHeatmap_2.10.0
##  [4] tibble_3.1.8          flextable_0.8.2       dplyr_1.0.10         
##  [7] purrr_0.3.5           plyr_1.8.7            gridExtra_2.3        
## [10] ggplot2_3.4.0         stringr_1.4.1         data.table_1.14.4    
## [13] readr_2.1.3          
## 
## loaded via a namespace (and not attached):
##  [1] tidyr_1.2.1         sass_0.4.2          jsonlite_1.8.3     
##  [4] foreach_1.5.2       bslib_0.4.1         assertthat_0.2.1   
##  [7] highr_0.9           stats4_4.1.2        yulab.utils_0.0.5  
## [10] yaml_2.3.6          gdtools_0.2.4       backports_1.4.1    
## [13] pillar_1.8.1        glue_1.6.2          uuid_1.1-0         
## [16] digest_0.6.30       colorspace_2.0-3    htmltools_0.5.3    
## [19] pkgconfig_2.0.3     GetoptLong_1.0.5    broom_1.0.1        
## [22] scales_1.2.1        officer_0.4.4       tzdb_0.3.0         
## [25] generics_0.1.3      farver_2.1.1        IRanges_2.28.0     
## [28] ellipsis_0.3.2      cachem_1.0.6        withr_2.5.0        
## [31] BiocGenerics_0.40.0 cli_3.4.1           magrittr_2.0.3     
## [34] crayon_1.5.2        evaluate_0.17       fansi_1.0.3        
## [37] doParallel_1.0.17   xml2_1.3.3          tools_4.1.2        
## [40] hms_1.1.2           GlobalOptions_0.1.2 lifecycle_1.0.3    
## [43] matrixStats_0.61.0  S4Vectors_0.32.4    munsell_0.5.0      
## [46] cluster_2.1.2       zip_2.2.0           compiler_4.1.2     
## [49] jquerylib_0.1.4     systemfonts_1.0.4   gridGraphics_0.5-1 
## [52] rlang_1.0.6         iterators_1.0.14    rstudioapi_0.14    
## [55] rjson_0.2.21        circlize_0.4.15     base64enc_0.1-3    
## [58] labeling_0.4.2      rmarkdown_2.17      gtable_0.3.1       
## [61] codetools_0.2-18    DBI_1.1.3           R6_2.5.1           
## [64] knitr_1.40          fastmap_1.1.0       utf8_1.2.2         
## [67] clue_0.3-62         shape_1.4.6         stringi_1.7.8      
## [70] parallel_4.1.2      Rcpp_1.0.8          vctrs_0.5.0        
## [73] png_0.1-7           tidyselect_1.2.0    xfun_0.37